library(AppliedPredictiveModeling)   # Data sets
library(tidyverse)                   # Oppan tidy style
library(caret)                       # Modeling
library(e1071)                       # skewness

Compute

data(segmentationOriginal)
segmentationOriginal <- as_tibble(segmentationOriginal)
segmentationOriginal
seg_data <- subset(segmentationOriginal, Case == "Train")
seg_data
cell_id <- seg_data$Case
class <- seg_data$Class
case <- seg_data$Case
seg_data <- seg_data[, -(1:3)]
seg_data %>% select(-contains("Status")) -> seg_data
seg_data

Skewness

skewness(seg_data$AngleCh1)
[1] -0.02426252
#seg_data %>% map_dfr(skewness)
summarize_all(seg_data, skewness)

Box-Cox transform

Ch1AreaTrans <- BoxCoxTrans(seg_data$AreaCh1)
Ch1AreaTrans
Box-Cox Transformation

1009 data points used to estimate Lambda

Input data summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  150.0   194.0   256.0   325.1   376.0  2186.0 

Largest/Smallest: 14.6 
Sample Skewness: 3.53 

Estimated Lambda: -0.9 

Apply the transform with the predict function

predict(Ch1AreaTrans, head(seg_data$AreaCh1)) -> dat
dat
[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523

Or perform it all at once via caret::preProcess

pca_object <- prcomp(seg_data, center = TRUE, scale = TRUE)
percent_variance <- pca_object$sdev^2/sum(pca_object$sdev^2)*100
percent_variance[1:3]
[1] 20.91236 17.01330 11.88689

Near zero variance

nearZeroVar(seg_data)
integer(0)

Correlations

correlations <- cor(seg_data)
dim(correlations)
[1] 58 58
correlations[1:4, 1:4]
                AngleCh1      AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1     1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1     -0.002627172  1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394  1.00000000  0.52521711
AvgIntenCh2 -0.019446810 -0.153303007  0.52521711  1.00000000
corrplot::corrplot(correlations, order = "hclust") 

Exercises

3.1

skimr::skim(Glass)
Skim summary statistics
 n obs: 214 
 n variables: 10 

Variable type: factor 
 variable missing complete   n n_unique                 top_counts ordered
     Type       0      214 214        6 2: 76, 1: 70, 7: 29, 3: 17   FALSE

Variable type: numeric 
 variable missing complete   n   mean    sd    p0   p25   p50   p75  p100     hist
       Al       0      214 214  1.44  0.5    0.29  1.19  1.36  1.63  3.5  ▁▂▇▅▂▁▁▁
       Ba       0      214 214  0.18  0.5    0     0     0     0     3.15 ▇▁▁▁▁▁▁▁
       Ca       0      214 214  8.96  1.42   5.43  8.24  8.6   9.17 16.19 ▁▂▇▂▁▁▁▁
       Fe       0      214 214  0.057 0.097  0     0     0     0.1   0.51 ▇▁▁▁▁▁▁▁
        K       0      214 214  0.5   0.65   0     0.12  0.56  0.61  6.21 ▇▁▁▁▁▁▁▁
       Mg       0      214 214  2.68  1.44   0     2.11  3.48  3.6   4.49 ▃▁▁▁▁▁▇▁
       Na       0      214 214 13.41  0.82  10.73 12.91 13.3  13.83 17.38 ▁▁▇▇▃▁▁▁
       RI       0      214 214  1.52  0.003  1.51  1.52  1.52  1.52  1.53 ▁▅▇▂▁▁▁▁
       Si       0      214 214 72.65  0.77  69.81 72.28 72.79 73.09 75.41 ▁▁▂▃▇▁▁▁

Predictor variables

select(Glass, -Type) %>% cor() %>% corrplot::corrplot()

caret::nearZeroVar(Glass)
integer(0)
summarize_if(Glass, .p = is.numeric, .f = e1071::skewness)
map_if(Glass, .p = is.numeric, .f = BoxCoxTrans) %>% map_dfr("lambda", .null=NA)

3.2

LS0tCnRpdGxlOiAiQ2hhcHRlciAzIC0gRGF0YSBQcmUtcHJvY2Vzc2luZyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyIHNldHVwfQpsaWJyYXJ5KEFwcGxpZWRQcmVkaWN0aXZlTW9kZWxpbmcpICAgIyBEYXRhIHNldHMKbGlicmFyeSh0aWR5dmVyc2UpICAgICAgICAgICAgICAgICAgICMgT3BwYW4gdGlkeSBzdHlsZQpsaWJyYXJ5KGNhcmV0KSAgICAgICAgICAgICAgICAgICAgICAgIyBNb2RlbGluZwpsaWJyYXJ5KGUxMDcxKSAgICAgICAgICAgICAgICAgICAgICAgIyBza2V3bmVzcwpgYGAKCiMjIENvbXB1dGUKCmBgYHtyfQpkYXRhKHNlZ21lbnRhdGlvbk9yaWdpbmFsKQpzZWdtZW50YXRpb25PcmlnaW5hbCA8LSBhc190aWJibGUoc2VnbWVudGF0aW9uT3JpZ2luYWwpCnNlZ21lbnRhdGlvbk9yaWdpbmFsCmBgYAoKYGBge3J9CnNlZ19kYXRhIDwtIHN1YnNldChzZWdtZW50YXRpb25PcmlnaW5hbCwgQ2FzZSA9PSAiVHJhaW4iKQpzZWdfZGF0YQpgYGAKCmBgYHtyfQpjZWxsX2lkIDwtIHNlZ19kYXRhJENhc2UKY2xhc3MgPC0gc2VnX2RhdGEkQ2xhc3MKY2FzZSA8LSBzZWdfZGF0YSRDYXNlCnNlZ19kYXRhIDwtIHNlZ19kYXRhWywgLSgxOjMpXQpzZWdfZGF0YSAlPiUgc2VsZWN0KC1jb250YWlucygiU3RhdHVzIikpIC0+IHNlZ19kYXRhCnNlZ19kYXRhCmBgYAoKU2tld25lc3MKCmBgYHtyfQpza2V3bmVzcyhzZWdfZGF0YSRBbmdsZUNoMSkKI3NlZ19kYXRhICU+JSBtYXBfZGZyKHNrZXduZXNzKQpzdW1tYXJpemVfYWxsKHNlZ19kYXRhLCBza2V3bmVzcykKYGBgCgpCb3gtQ294IHRyYW5zZm9ybQpgYGB7cn0KQ2gxQXJlYVRyYW5zIDwtIEJveENveFRyYW5zKHNlZ19kYXRhJEFyZWFDaDEpCkNoMUFyZWFUcmFucwpgYGAKCkFwcGx5IHRoZSB0cmFuc2Zvcm0gd2l0aCB0aGUgYHByZWRpY3RgIGZ1bmN0aW9uCgpgYGB7cn0KcHJlZGljdChDaDFBcmVhVHJhbnMsIGhlYWQoc2VnX2RhdGEkQXJlYUNoMSkpIC0+IGRhdApkYXQKYGBgCgpPciBwZXJmb3JtIGl0IGFsbCBhdCBvbmNlIHZpYSBgY2FyZXQ6OnByZVByb2Nlc3NgCgpgYGB7cn0KcGNhX29iamVjdCA8LSBwcmNvbXAoc2VnX2RhdGEsIGNlbnRlciA9IFRSVUUsIHNjYWxlID0gVFJVRSkKcGVyY2VudF92YXJpYW5jZSA8LSBwY2Ffb2JqZWN0JHNkZXZeMi9zdW0ocGNhX29iamVjdCRzZGV2XjIpKjEwMApwZXJjZW50X3ZhcmlhbmNlWzE6M10KYGBgCgpOZWFyIHplcm8gdmFyaWFuY2UKYGBge3J9Cm5lYXJaZXJvVmFyKHNlZ19kYXRhKQpgYGAKCkNvcnJlbGF0aW9ucwoKYGBge3J9CmNvcnJlbGF0aW9ucyA8LSBjb3Ioc2VnX2RhdGEpCmRpbShjb3JyZWxhdGlvbnMpCmNvcnJlbGF0aW9uc1sxOjQsIDE6NF0KYGBgCgpgYGB7cn0KY29ycnBsb3Q6OmNvcnJwbG90KGNvcnJlbGF0aW9ucywgb3JkZXIgPSAiaGNsdXN0IikgCmBgYAoKIyMgRXhlcmNpc2VzCgojIyMgMy4xIAoKYGBge3J9CmxpYnJhcnkobWxiZW5jaCkKZGF0YShHbGFzcykKI3N0cihHbGFzcykKc2tpbXI6OnNraW0oR2xhc3MpCmBgYAoKUHJlZGljdG9yIHZhcmlhYmxlcwoKYGBge3J9CnNlbGVjdChHbGFzcywgLVR5cGUpICU+JSBjb3IoKSAlPiUgY29ycnBsb3Q6OmNvcnJwbG90KCkKYGBgCgpgYGB7cn0KY2FyZXQ6Om5lYXJaZXJvVmFyKEdsYXNzKQpzdW1tYXJpemVfaWYoR2xhc3MsIC5wID0gaXMubnVtZXJpYywgLmYgPSBlMTA3MTo6c2tld25lc3MpCm1hcF9pZihHbGFzcywgLnAgPSBpcy5udW1lcmljLCAuZiA9IEJveENveFRyYW5zKSAlPiUgbWFwX2RmcigibGFtYmRhIiwgLm51bGw9TkEpCmBgYAoKIyMjIDMuMgo=